DATA

  1. Id : Unique identifier for each observation.
  2. AB-GL : Fifty-six anonymized health characteristics. All are numeric except for EJ, which is categorical.
  3. Class : 1 indicates the subject has been diagnosed with one of the three conditions; 0 indicates they have not.
  4. Alpha : Identifies the type of age-related condition, if present.
    A : No age-related condition. Corresponds to class 0.
    B, D, G : The three age-related conditions. Correspond to class 1.
head(data)
##             Id       AB        AF        AH        AM       AR       AX
## 1 000ff2bfdfe9 0.209377 3109.0333  85.20015 22.394407 8.138688 0.699861
## 2 007255e47698 0.145282  978.7642  85.20015 36.968889 8.138688 3.632190
## 3 013f2bd269f5 0.470030 2635.1065  85.20015 32.360553 8.138688 6.732840
## 4 043ac50845d5 0.252107 3819.6518 120.20162 77.112203 8.138688 3.685344
## 5 044fb8a146ec 0.380297 3733.0484  85.20015 14.103738 8.138688 3.942255
## 6 04517a3c90bd 0.209377 2615.8143  85.20015  8.541526 8.138688 4.013127
##         AY        AZ         BC       BD      BN       BP        BQ         BR
## 1 0.025578  9.812214   5.555634 4126.587 22.5984 175.6387 152.70771  823.92824
## 2 0.025578 13.517790   1.229900 5496.928 19.4205 155.8680  14.75472   51.21688
## 3 0.025578 12.824570   1.229900 5135.780 26.4825 128.9885 219.32016  482.14159
## 4 0.025578 11.053708   1.229900 4169.677 23.6577 237.2823  11.05041  661.51864
## 5 0.054810  3.396778 102.151980 5728.734 24.0108 324.5463 149.71716 6074.85948
## 6 0.025578 12.547282   1.229900 5237.541 10.2399 148.4879  16.52612  642.32516
##         BZ       CB        CC       CD        CF       CH       CL       CR
## 1 257.4324 47.22336 0.5634806 23.38760  4.851915 0.023482 1.050225 0.069225
## 2 257.4324 30.28434 0.4847101 50.62821  6.085041 0.031442 1.113875 1.117800
## 3 257.4324 32.56371 0.4958516 85.95538  5.376488 0.036218 1.050225 0.700350
## 4 257.4324 15.20191 0.7178821 88.15936  2.347652 0.029054 1.400300 0.636075
## 5 257.4324 82.21349 0.5364670 72.64426 30.537722 0.025472 1.050225 0.693150
## 6 257.4324 18.38200 0.6394604 80.66740 14.688030 0.016716 1.050225 0.857625
##         CS       CU       CW       DA       DE      DF       DH        DI
## 1 13.78411 1.302012 36.20596 69.08340 295.5706 0.23868 0.284232  89.24556
## 2 28.31095 1.357182 37.47657 70.79836 178.5531 0.23868 0.363489 110.58182
## 3 39.36474 1.009611 21.45964 70.81970 321.4266 0.23868 0.210441 120.05644
## 4 41.11696 0.722727 21.53039 47.27586 196.6080 0.23868 0.292431 139.82457
## 5 31.72473 0.827550 34.41536 74.06532 200.1782 0.23868 0.207708  97.92012
## 6 32.45700 1.390284  7.03064 55.22404 135.4892 0.23868 0.478275 135.31787
##         DL       DN        DU      DV        DY       EB       EE        EG
## 1 84.31664 29.65710 5.3106900 1.74307 23.187704 7.294176 1.987283  1433.167
## 2 75.74548 37.53200 0.0055176 1.74307 17.222328 4.926396 0.858603  1111.287
## 3 65.46984 28.05346 1.2897390 1.74307 36.861352 7.813674 8.146651  1494.076
## 4 71.57120 24.35486 2.6553450 1.74307 52.003884 7.386060 3.813326 15691.552
## 5 52.83888 26.01991 1.1449020 1.74307  9.064856 7.350720 3.490846  1403.656
## 6 81.46312 31.73160 0.0055176 1.74307 16.773128 4.926396 2.394414   866.383
##         EH EJ        EL       EP        EU        FC        FD        FE
## 1 0.949104  B  30.87942 78.52697  3.828384  13.39464 10.265073  9028.292
## 2 0.003042  A 109.12516 95.41509 52.260480  17.17598  0.296850  6785.003
## 3 0.377208  B 109.12516 78.52697  5.390628 224.20742  8.745201  8338.906
## 4 0.614484  B  31.67436 78.52697 31.323372  59.30198  7.884336 10965.766
## 5 0.164268  B 109.12516 91.99483 51.141336  29.10264  4.274640 16198.050
## 6 0.003042  A 109.12516 78.52697  3.828384  23.30496  0.296850  8517.279
##         FI       FL       FR       FS        GB        GE        GF       GH
## 1  3.58345 7.298162  1.73855 0.094822 11.339138  72.61106  2003.810 22.13623
## 2 10.35893 0.173229  0.49706 0.568932  9.292698  72.61106 27981.563 29.13543
## 3 11.62692 7.709560  0.97556 1.198821 37.077772  88.60944 13676.958 28.02285
## 4 14.85202 6.122162  0.49706 0.284466 18.529584  82.41680  2094.262 39.94866
## 5 13.66673 8.153058 48.50134 0.121914 16.408728 146.10994  8524.371 45.38132
## 6 10.98190 0.173229  0.49706 1.164956 21.915512  72.61106 24177.596 28.52519
##         GI          GL Class Alpha
## 1 69.83494  0.12034286     1     B
## 2 32.13200 21.97800000     0     A
## 3 35.19268  0.19694118     0     A
## 4 90.49325  0.15582857     0     A
## 5 36.26263  0.09661446     1     D
## 6 82.52776 21.97800000     0     A
summary(data) 
##       Id                  AB                AF                AH        
##  Length:617         Min.   :0.08119   Min.   :  192.6   Min.   :  85.2  
##  Class :character   1st Qu.:0.25211   1st Qu.: 2197.3   1st Qu.:  85.2  
##  Mode  :character   Median :0.35466   Median : 3120.3   Median :  85.2  
##                     Mean   :0.47715   Mean   : 3502.0   Mean   : 118.6  
##                     3rd Qu.:0.55976   3rd Qu.: 4361.6   3rd Qu.: 113.7  
##                     Max.   :6.16167   Max.   :28688.2   Max.   :1910.1  
##                                                                         
##        AM                AR                AX                AY          
##  Min.   :  3.178   Min.   :  8.139   Min.   : 0.6999   Min.   : 0.02558  
##  1st Qu.: 12.270   1st Qu.:  8.139   1st Qu.: 4.1283   1st Qu.: 0.02558  
##  Median : 20.533   Median :  8.139   Median : 5.0319   Median : 0.02558  
##  Mean   : 38.969   Mean   : 10.128   Mean   : 5.5456   Mean   : 0.06032  
##  3rd Qu.: 39.140   3rd Qu.:  8.139   3rd Qu.: 6.4316   3rd Qu.: 0.03684  
##  Max.   :630.518   Max.   :178.944   Max.   :38.2709   Max.   :10.31585  
##                                                                          
##        AZ               BC                 BD              BN        
##  Min.   : 3.397   Min.   :   1.230   Min.   : 1694   Min.   : 9.887  
##  1st Qu.: 8.130   1st Qu.:   1.230   1st Qu.: 4156   1st Qu.:19.421  
##  Median :10.461   Median :   1.230   Median : 4998   Median :21.186  
##  Mean   :10.566   Mean   :   8.053   Mean   : 5350   Mean   :21.419  
##  3rd Qu.:12.970   3rd Qu.:   5.081   3rd Qu.: 6036   3rd Qu.:23.658  
##  Max.   :38.972   Max.   :1463.693   Max.   :53061   Max.   :29.307  
##                                                                      
##        BP                BQ                BR                  BZ         
##  Min.   :  72.95   Min.   :  1.331   Min.   :    51.22   Min.   :  257.4  
##  1st Qu.: 156.85   1st Qu.: 27.834   1st Qu.:   424.99   1st Qu.:  257.4  
##  Median : 193.91   Median : 61.642   Median :   627.42   Median :  257.4  
##  Mean   : 231.32   Mean   : 98.329   Mean   :  1218.13   Mean   :  550.6  
##  3rd Qu.: 247.80   3rd Qu.:134.009   3rd Qu.:   975.65   3rd Qu.:  257.4  
##  Max.   :2447.81   Max.   :344.644   Max.   :179250.25   Max.   :50092.5  
##                    NA's   :60                                             
##        CB                CC               CD               CF          
##  Min.   :  12.50   Min.   :0.1769   Min.   : 23.39   Min.   :  0.5109  
##  1st Qu.:  23.32   1st Qu.:0.5637   1st Qu.: 64.72   1st Qu.:  5.0663  
##  Median :  42.55   Median :0.6587   Median : 79.82   Median :  9.1230  
##  Mean   :  77.10   Mean   :0.6888   Mean   : 90.25   Mean   : 11.2411  
##  3rd Qu.:  77.31   3rd Qu.:0.7722   3rd Qu.: 99.81   3rd Qu.: 13.5659  
##  Max.   :2271.44   Max.   :4.1030   Max.   :633.53   Max.   :200.9675  
##  NA's   :2         NA's   :3                                           
##        CH                 CL               CR                CS        
##  Min.   :0.003184   Min.   : 1.050   Min.   :0.06922   Min.   : 13.78  
##  1st Qu.:0.023482   1st Qu.: 1.050   1st Qu.:0.58957   1st Qu.: 29.78  
##  Median :0.027860   Median : 1.050   Median :0.73080   Median : 34.84  
##  Mean   :0.030615   Mean   : 1.404   Mean   :0.74226   Mean   : 36.92  
##  3rd Qu.:0.034427   3rd Qu.: 1.228   3rd Qu.:0.85935   3rd Qu.: 40.53  
##  Max.   :0.224074   Max.   :31.688   Max.   :3.03967   Max.   :267.94  
##                                                                        
##        CU               CW               DA                DE        
##  Min.   :0.1379   Min.   : 7.031   Min.   :  6.906   Min.   :  36.0  
##  1st Qu.:1.0703   1st Qu.: 7.031   1st Qu.: 37.943   1st Qu.: 188.8  
##  Median :1.3517   Median :36.019   Median : 49.181   Median : 307.5  
##  Mean   :1.3838   Mean   :27.166   Mean   : 51.128   Mean   : 401.9  
##  3rd Qu.:1.6606   3rd Qu.:37.936   3rd Qu.: 61.409   3rd Qu.: 507.9  
##  Max.   :4.9515   Max.   :64.522   Max.   :210.331   Max.   :2103.4  
##                                                                      
##        DF                DH                DI                DL        
##  Min.   : 0.2387   Min.   :0.04099   Min.   :  60.23   Min.   : 10.35  
##  1st Qu.: 0.2387   1st Qu.:0.29516   1st Qu.: 102.70   1st Qu.: 78.23  
##  Median : 0.2387   Median :0.35802   Median : 130.05   Median : 96.27  
##  Mean   : 0.6339   Mean   :0.36700   Mean   : 146.97   Mean   : 94.80  
##  3rd Qu.: 0.2387   3rd Qu.:0.42635   3rd Qu.: 165.84   3rd Qu.:110.64  
##  Max.   :37.8950   Max.   :1.06040   Max.   :1049.17   Max.   :326.24  
##                                                                        
##        DN              DU                  DV               DY          
##  Min.   : 6.34   Min.   :  0.00552   Min.   : 1.743   Min.   :  0.8041  
##  1st Qu.:20.89   1st Qu.:  0.00552   1st Qu.: 1.743   1st Qu.: 14.7158  
##  Median :25.25   Median :  0.25174   Median : 1.743   Median : 21.6425  
##  Mean   :26.37   Mean   :  1.80290   Mean   : 1.925   Mean   : 26.3890  
##  3rd Qu.:30.54   3rd Qu.:  1.05869   3rd Qu.: 1.743   3rd Qu.: 34.0583  
##  Max.   :62.81   Max.   :161.35531   Max.   :25.193   Max.   :152.3552  
##                  NA's   :1                                              
##        EB               EE                EG                EH          
##  Min.   : 4.926   Min.   : 0.2862   Min.   :  185.6   Min.   : 0.00304  
##  1st Qu.: 5.965   1st Qu.: 1.6487   1st Qu.: 1111.2   1st Qu.: 0.00304  
##  Median : 8.149   Median : 2.6161   Median : 1493.8   Median : 0.08518  
##  Mean   : 9.073   Mean   : 3.0648   Mean   : 1731.2   Mean   : 0.30511  
##  3rd Qu.:10.503   3rd Qu.: 3.9101   3rd Qu.: 1905.7   3rd Qu.: 0.23728  
##  Max.   :94.959   Max.   :18.3249   Max.   :30243.8   Max.   :42.56975  
##                                                                         
##       EJ                  EL                EP                EU          
##  Length:617         Min.   :  5.395   Min.   :  78.53   Min.   :   3.828  
##  Class :character   1st Qu.: 30.927   1st Qu.:  78.53   1st Qu.:   4.325  
##  Mode  :character   Median : 71.949   Median :  78.53   Median :  22.641  
##                     Mean   : 69.583   Mean   : 105.06   Mean   :  69.117  
##                     3rd Qu.:109.125   3rd Qu.: 112.77   3rd Qu.:  49.085  
##                     Max.   :109.125   Max.   :1063.59   Max.   :6501.264  
##                     NA's   :60                                            
##        FC                 FD                  FE               FI        
##  Min.   :   7.534   Min.   :   0.2968   Min.   :  1563   Min.   : 3.583  
##  1st Qu.:  25.815   1st Qu.:   0.2968   1st Qu.:  5165   1st Qu.: 8.523  
##  Median :  36.394   Median :   1.8702   Median :  7345   Median : 9.945  
##  Mean   :  71.341   Mean   :   6.9301   Mean   : 10307   Mean   :10.111  
##  3rd Qu.:  56.714   3rd Qu.:   4.8802   3rd Qu.: 10648   3rd Qu.:11.517  
##  Max.   :3030.656   Max.   :1578.6542   Max.   :143225   Max.   :35.851  
##  NA's   :1                                                               
##        FL                 FR                  FS                 GB         
##  Min.   :  0.1732   Min.   :   0.4971   Min.   : 0.06773   Min.   :  4.102  
##  1st Qu.:  0.1732   1st Qu.:   0.4971   1st Qu.: 0.06773   1st Qu.: 14.037  
##  Median :  3.0281   Median :   1.1310   Median : 0.25060   Median : 18.771  
##  Mean   :  5.4332   Mean   :   3.5339   Mean   : 0.42150   Mean   : 20.725  
##  3rd Qu.:  6.2388   3rd Qu.:   1.5121   3rd Qu.: 0.53507   3rd Qu.: 25.608  
##  Max.   :137.9327   Max.   :1244.2270   Max.   :31.36576   Max.   :135.781  
##  NA's   :1                              NA's   :2                           
##        GE                GF                  GH               GI          
##  Min.   :  72.61   Min.   :    13.04   Min.   : 9.433   Min.   :  0.8976  
##  1st Qu.:  72.61   1st Qu.:  2798.99   1st Qu.:25.035   1st Qu.: 23.0117  
##  Median :  72.61   Median :  7838.27   Median :30.609   Median : 41.0080  
##  Mean   : 131.72   Mean   : 14679.60   Mean   :31.490   Mean   : 50.5844  
##  3rd Qu.: 127.59   3rd Qu.: 19035.71   3rd Qu.:36.864   3rd Qu.: 67.9317  
##  Max.   :1497.35   Max.   :143790.07   Max.   :81.211   Max.   :191.1948  
##                                                                           
##        GL                Class          Alpha          
##  Min.   : 0.001129   Min.   :0.000   Length:617        
##  1st Qu.: 0.124392   1st Qu.:0.000   Class :character  
##  Median : 0.337827   Median :0.000   Mode  :character  
##  Mean   : 8.530961   Mean   :0.175                     
##  3rd Qu.:21.978000   3rd Qu.:0.000                     
##  Max.   :21.978000   Max.   :1.000                     
##  NA's   :1
data_table <- table(data$Class);data_table
## 
##   0   1 
## 509 108
pie(data_table,main = "data$Class")

data_table <- table(data$Alpha);data_table
## 
##   A   B   D   G 
## 509  61  18  29
pie(data_table,main = "data$Alpha")
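Only about 17.5% of the subjects carry a positive diagnosis, so both labels are clearly imbalanced; expressing the Class counts as proportions (509 and 108 out of 617) makes this explicit:

round(prop.table(table(data$Class)), 3)
## 
##     0     1 
## 0.825 0.175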

DATA CLEANING

  1. Missing-value handling, e.g. NULL & NA.
    Variables with a missing proportion above 95% : drop the variable outright.
    Variables with a missing proportion below 95% : besides manually checking the raw data, choose an imputation strategy; e.g. impute skewed variables with the median, approximately normal ones with the mean, or estimate the missing values with linear regression.
  2. Duplicate handling.
  3. Noise handling : common methods are binning, regression, and clustering, used to detect outliers.
  4. Feature selection.
#1 Data cleaning
#1-1. Missing-value handling (NULL / NA)
n <- sum(is.na(data))  # total number of missing values
# A variable whose missing proportion exceeds 95% can be dropped outright.
# Below 95%, inspect the raw data and pick an imputation strategy:
# median for skewed variables, mean for normal ones, or a regression estimate.
if(n != 0){
  seat <- c()  # seat: indices of variables containing missing values
  na   <- c()  # na: indices of variables with a missing proportion > 95%
  for (i in 1:ncol(data)) {
    if(sum(is.na(data[, i])) != 0){
      seat <- c(seat, i)
    }
  }
  for (i in seat) {
    if(sum(is.na(data[, i])) / nrow(data) > 0.95){
      na <- c(na, i)
    }
  }
  if(!is.null(na)){
    data <- data[, -na]
  }
}

#1-2.1 Duplicate handling
n <- sum(duplicated(data))  # number of duplicated rows
if(n != 0){
  data <- data[!duplicated(data), ]
}

#1-2.2 NULL/NA handling: drop the rows outright, or impute with the median or mean.
# Option 1: drop the affected rows.
# Option 2: impute skewed variables with the median.
# Use histograms and box plots to judge each variable's distribution.
par(mfrow=c(2,4))
for(i in 1:length(seat)){
  hist(data[,seat[i]],freq = TRUE,
       main = paste("Histogram of" , colnames(data)[seat[i]]),
       xlab = colnames(data)[seat[i]])
  boxplot(data[,seat[i]],
          main = colnames(data)[seat[i]],
          xlab = "time",outcol="red")
}

## EL : approximately normal; impute with the mean, mean = 69.5826
## BQ : skewed; impute with the median, median = 61.64211
## CB : skewed; impute with the median, median = 42.55433
## CC : skewed; impute with the median, median = 0.6587155
## DU : skewed; impute with the median, median = 0.2517405
## FC : skewed; impute with the median, median = 36.39401
## FL : skewed; impute with the median, median = 3.028141
## FS : skewed; impute with the median, median = 0.250601
## GL : skewed; impute with the median, median = 0.3378275
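The imputation step itself is not shown; a minimal sketch that applies the rules above (mean for EL, median for the rest), assuming `seat` still holds the indices of the columns with missing values:

for (i in seat) {
  v <- data[, i]
  # mean for EL (approximately normal), median for the others (skewed)
  fill <- if (colnames(data)[i] == "EL") mean(v, na.rm = TRUE) else median(v, na.rm = TRUE)
  data[is.na(v), i] <- fill
}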
# Box plots for noise (outlier) handling
par(mfrow=c(3,5))
for(i in c(2:40,42:(length(data)-2))){  # numeric columns only: skip Id, EJ, Class, Alpha
  boxplot(data[,i],
          main=colnames(data)[i],
          xlab = "time",outcol="red")
}

## 21 observations were removed in total.
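The filtering code is not shown above; as an illustration only (not the author's exact criterion), rows holding values beyond the box-plot whiskers can be located with boxplot.stats and dropped:

num_cols <- c(2:40, 42:(length(data) - 2))  # the numeric columns, as in the loop above
out_rows <- unique(unlist(lapply(num_cols, function(i)
  which(data[, i] %in% boxplot.stats(data[, i])$out))))
# data <- data[-out_rows, ]  # the original run removed 21 rows by some such rule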
# Correlation: consider dropping some of the highly correlated variables.
par(mfrow=c(1,1))
correlation <- cor(data[,-c(1,41,58,59)]) # drop Id and the categorical columns (EJ, Class, Alpha)
corrplot(correlation)

##   row col       cor
## 1  FD  EH 0.9713156
## 2  DV  CL 0.9157253
## 3  BZ  BC 0.9092153
## 4  EH  DU 0.8509647
## 5  FD  DU 0.8062627
## 6  BD  BC 0.7593433
## 7  EP  CS 0.7164126
## 8  DV  AR 0.7090023
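A pair list like the one above can be pulled straight from the correlation matrix; a minimal sketch (the 0.7 cutoff is inferred from the smallest value shown):

# Upper-triangle entries with |cor| > 0.7, reported with variable names.
high <- which(abs(correlation) > 0.7 & upper.tri(correlation), arr.ind = TRUE)
data.frame(row = rownames(correlation)[high[, 1]],
           col = colnames(correlation)[high[, 2]],
           cor = correlation[high])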
# Rank the features by importance
# prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
model <- train(Alpha~., data=data[,-c(1,41,58)], method="lvq", preProcess="scale", trControl=control)
# estimate variable importance
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## ROC curve variable importance
## 
##   variables are sorted by maximum importance across the classes
##   only 20 most important variables shown (out of 55)
## 
##         A      B      D      G
## DU 0.9343 0.9343 0.9343 0.6530
## CR 0.7914 0.9212 0.6638 0.9212
## FL 0.9031 0.9031 0.9031 0.6347
## AB 0.7837 0.8996 0.7016 0.8996
## GL 0.8798 0.8798 0.8798 0.6275
## EH 0.8427 0.8427 0.8793 0.6867
## DA 0.6683 0.8727 0.6111 0.8727
## BC 0.8725 0.7301 0.8534 0.8725
## FD 0.8203 0.8203 0.8602 0.6919
## FR 0.8378 0.6924 0.7538 0.8378
## AF 0.7118 0.8329 0.6249 0.8329
## DI 0.6341 0.8194 0.5811 0.8194
## DE 0.7531 0.8102 0.6916 0.8102
## FI 0.5598 0.7990 0.5795 0.7990
## DF 0.5642 0.7937 0.5642 0.7937
## EE 0.7584 0.7930 0.7304 0.7930
## BQ 0.7843 0.7047 0.7047 0.7843
## CD 0.7414 0.7818 0.7011 0.7818
## GH 0.7064 0.5520 0.7366 0.7064
## DL 0.5993 0.7346 0.5556 0.7346
plot(importance,10)
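Cross-referencing the importance ranking with the correlation pairs motivates the deletions noted below. For the correlation side, caret's findCorrelation automates the suggestion (the 0.75 cutoff here is an assumption):

# Columns caret would drop so that no remaining pair exceeds the cutoff.
findCorrelation(correlation, cutoff = 0.75, names = TRUE)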

## Removed BZ, CS, DV, EH, FD: 5 variables in total.
data <- data[,-c(16,24,35,40,46)] # BZ, CS, DV, EH, FD

MODELING AND PREDICTION

  1. Use 70% of the data for training (train) and the rest for testing (test); a sketch of the split follows this list.
  2. Build the models.
  3. Predict.
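The split itself does not appear in the original output; a minimal sketch, assuming a simple 70/30 random split with Id and Class dropped so that Alpha is the modeling target (the seed and the column positions are assumptions):

set.seed(123)  # arbitrary seed, only for reproducibility
idx <- sample(nrow(data), round(0.7 * nrow(data)))
data_train  <- data[idx,  -c(1, 53)]  # drop Id and Class (assumed positions after cleaning)
data_test   <- data[-idx, -c(1, 53)]
data_test_y <- data_test$Alpha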


  • Multinomial logistic regression (glm) : nnet package
    • family:
      1. If y is continuous, set "gaussian".
      2. If y is binary, set "binomial".
      3. If y is multiclass, use package VGAM ("vglm") or nnet ("multinom").
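The fit that produced the trace and coefficient summary below is nnet::multinom on all remaining predictors; a minimal reconstruction (the echoed call spells the response as data_train$Alpha ~ ., for which Alpha ~ . is the idiomatic equivalent):

library(nnet)
# Multinomial logistic regression of Alpha on all predictors kept after cleaning.
mul_glm <- multinom(Alpha ~ ., data = data_train)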
## # weights:  212 (156 variable)
## initial  value 578.084749 
## iter  10 value 330.706030
## iter  20 value 274.979321
## iter  30 value 224.771728
## iter  40 value 193.611703
## iter  50 value 174.534572
## iter  60 value 162.813204
## iter  70 value 155.875361
## iter  80 value 150.189941
## iter  90 value 142.818521
## iter 100 value 113.508791
## final  value 113.508791 
## stopped after 100 iterations
## Call:
## multinom(formula = data_train$Alpha ~ ., data = data_train)
## 
## Coefficients:
##   (Intercept)          AB            AF           AH           AM         AR
## B  -0.1509263 -0.05286289 -3.984784e-05 -0.006558726  0.002153208 0.09630950
## D  -0.1774341 -0.06380836  7.177588e-06 -0.007001501  0.007995486 0.12495040
## G   0.1021002  0.42490839 -2.342680e-05  0.011971464 -0.010061900 0.02018948
##            AX          AY          AZ           BC            BD         BN
## B -0.11605973 -0.30520175  0.03238611  0.002643015  3.273055e-05 0.13180853
## D  0.03951675  1.30788374 -0.01658032  0.019419764 -4.503123e-04 0.04353421
## G  0.05490295 -0.06134055 -0.18619806 -0.012539455  1.005106e-04 0.11633918
##              BP           BQ            BR            CB         CC
## B -0.0010812087  0.003484354 -3.739151e-04 -0.0027672077 -0.4082003
## D  0.0022273838  0.006389577 -1.121811e-04 -0.0001344132 -0.8836127
## G  0.0007179518 -0.000190816  3.173996e-05 -0.0036031623 -0.8385099
##             CD           CF            CH         CL          CR         CU
## B  0.003169193  0.008296113 -0.0395547420  0.2647348 -0.06831331 -0.3152782
## D  0.007854072 -0.030298887 -0.0006786629  0.2231666 -0.78789642 -1.1756263
## G -0.002331321  0.083880148 -0.0007554174 -0.4513234 -0.75911773 -0.2595503
##             CW          DA            DE         DF          DH          DI
## B -0.001437563  0.01267926  3.553787e-04 -0.2997375 -0.18902277 0.002718051
## D  0.035021492  0.00211708  8.620886e-05 -2.3820817 -0.09123155 0.001621095
## G  0.006931829 -0.05503352 -1.609599e-03  0.4326768  0.17098995 0.008273105
##              DL          DN         DU           DY          EB           EE
## B -0.0004398532 -0.09295295 0.29896058  0.002957612 0.006876652 -0.173571731
## D  0.0010361573 -0.11674280 0.04133997 -0.004673100 0.095023436 -0.433648791
## G -0.0025769470  0.09077390 0.34293694  0.027435079 0.245666316 -0.003789343
##              EG         EJB            EL          EP            EU
## B -8.135594e-05  0.05189739  0.0055444263 -0.01467658 -0.0005669425
## D  2.356909e-05 -0.25713409 -0.0008464824 -0.01297510  0.0008148422
## G  1.334416e-05  0.33212133  0.0150112430 -0.02511732  0.0014864501
##              FC            FE         FI           FL         FR          FS
## B -0.0001778567  2.282627e-05 -0.1798275  0.007594089 0.01797391 -0.09011913
## D -0.0004515410 -1.064647e-05 -0.1007670  0.068963389 0.01940918 -1.11489407
## G -0.0009704073 -4.055067e-05 -0.1140404 -0.115597639 0.01992038  0.57430461
##            GB            GE            GF          GH           GI          GL
## B  0.02170784 -0.0007889106 -1.152557e-05 -0.01435691  0.001946791 -0.07076732
## D  0.05731336  0.0003047149 -1.768222e-05  0.08943205  0.003119980  0.05614217
## G -0.08151857 -0.0147663588 -3.638359e-04 -0.05750531 -0.001335923  0.12668714
## 
## Std. Errors:
##    (Intercept)           AB           AF          AH          AM         AR
## B 0.0009595450 0.0007977852 0.0001289596 0.004665557 0.004185206 0.03066783
## D 0.0007636351 0.0008567020 0.0001496734 0.003971592 0.004025165 0.04999310
## G 0.0003890734 0.0005590610 0.0001842735 0.005775038 0.007039822 0.02633763
##            AX           AY         AZ          BC           BD         BN
## B 0.005049923 4.238524e-05 0.04631733 0.007742616 0.0001654001 0.04906807
## D 0.006234000 1.500348e-04 0.04541586 0.007498300 0.0001847551 0.04549537
## G 0.003716602 3.190274e-05 0.01411455 0.011408550 0.0003009478 0.01295311
##            BP          BQ           BR          CB           CC          CD
## B 0.002285443 0.002848306 3.951666e-04 0.003717429 0.0002131052 0.005407359
## D 0.001169330 0.003319572 2.561693e-04 0.001240126 0.0004015659 0.004672981
## G 0.003909154 0.004574361 2.402842e-05 0.005295771 0.0001931354 0.007308641
##           CF           CH          CL           CR           CU         CW
## B 0.02140414 3.398051e-05 0.004604977 0.0004479379 0.0014710412 0.01558086
## D 0.01720676 2.126635e-05 0.006999834 0.0011507012 0.0009778794 0.01888989
## G 0.02999860 2.561722e-05 0.001882607 0.0006354181 0.0011379661 0.02810639
##           DA           DE           DF           DH          DI          DL
## B 0.01339479 0.0006334966 0.0093223158 0.0002350990 0.004141124 0.010229622
## D 0.01256904 0.0009307666 0.0008353959 0.0002152910 0.005486366 0.009992021
## G 0.02613008 0.0015980419 0.0068219715 0.0002568505 0.005627866 0.017594902
##           DN          DU         DY          EB          EE           EG
## B 0.03467783 0.030558215 0.01428548 0.011549862 0.004738628 0.0001535051
## D 0.04207503 0.009415026 0.01485759 0.009975388 0.006691007 0.0001225930
## G 0.04732775 0.018966608 0.01670585 0.010628829 0.003824033 0.0002051333
##           EJB          EL          EP          EU          FC           FE
## B 0.002084692 0.006477416 0.008407475 0.002866428 0.002624103 1.421014e-05
## D 0.001637421 0.008083691 0.006414451 0.000569322 0.002251986 2.442989e-05
## G 0.001797366 0.011740904 0.008121760 0.002290208 0.002048985 5.009113e-05
##            FI         FL         FR           FS         GB          GE
## B 0.004653873 0.02462051 0.01161679 0.0009891375 0.02529570 0.002662993
## D 0.004735451 0.03076549 0.01161348 0.0004308318 0.02682456 0.001797290
## G 0.006809821 0.03554099 0.01178472 0.0061626713 0.05266481 0.008954139
##             GF         GH          GI         GL
## B 2.026987e-05 0.02593350 0.007282057 0.04104318
## D 1.701728e-05 0.02755057 0.008452051 0.03305564
## G 6.489192e-05 0.04232063 0.013910629 0.04078302
## 
## Residual Deviance: 227.0176 
## AIC: 539.0176
##    data_test_y
## pre   A   B   D   G
##   A 130   6   1   5
##   B   6  12   0   0
##   D   6   1   2   1
##   G   8   0   1   0
## accuracy_mul_glm =  0.9220779
  • Variable selection : pick the best-performing model by AIC or BIC (the smaller, the better); the stepwise fits are sketched below.
    1. Stepwise: Backward Elimination
    2. Stepwise: Forward Selection
    3. Both directions
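The backward, forward, and both objects summarized below are stepwise-selected multinom fits; the selection code is not shown in the original output, so here is a minimal sketch with MASS::stepAIC (which works on multinom fits through their logLik/AIC methods; mul_glm is the full model assumed above):

library(MASS)
# Backward elimination from the full model, forward selection from the null model,
# and a bidirectional search; trace = FALSE suppresses the per-step printout.
backward <- stepAIC(mul_glm, direction = "backward", trace = FALSE)
null_fit <- multinom(Alpha ~ 1, data = data_train)
forward  <- stepAIC(null_fit, scope = list(lower = ~ 1, upper = formula(mul_glm)),
                    direction = "forward", trace = FALSE)
both     <- stepAIC(mul_glm, direction = "both", trace = FALSE)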
summary(backward)
## Call:
## multinom(formula = data_train$Alpha ~ AF + AZ + BN + BP + BQ + 
##     CB + CD + CR + CU + CW + DF + DH + DL + DN + DU + EB + EE + 
##     EG + EJ + EP + FE + FL + FR + GE + GH + GI, data = data_train)
## 
## Coefficients:
##   (Intercept)            AF         AZ          BN           BP          BQ
## B  -31.452804 -0.0010829100  0.4141326  1.00380517  0.005478383 0.047499930
## D  -36.451935 -0.0009266004  0.7378708 -0.01550763  0.011416847 0.015608568
## G    3.255081  0.0023403568 -0.2435613  1.73438010 -0.007469210 0.006884419
##            CB         CD        CR        CU          CW         DF         DH
## B  0.01104571 0.04923306 -13.69959  1.674493 -0.04118839 -0.6420895   1.898692
## D -0.03482011 0.11034681 -23.64157 -2.940817  0.54750881 -5.8496600  39.668351
## G -0.02718327 0.07670444 -29.69885  4.989313 -0.07670047  2.6285259 -28.390525
##            DL          DN        DU          EB        EE            EG
## B  0.04035556 -0.74083511 1.0810919  0.91363853 -1.300446 -0.0051604095
## D -0.23193384 -0.07867187 0.8138584 -0.01761589 -4.782734 -0.0018266107
## G -0.05373887 -0.37679842 1.1281709  0.47839354 -6.039630 -0.0008223286
##          EJB          EP            FE         FL       FR          GE
## B 24.0509945 -0.17361419  0.0002176354  0.1319665 1.205454 -0.02304689
## D  0.6950821  0.04359718 -0.0001240826 -0.1213791 2.296990  0.02213737
## G -1.6931897 -0.05901653  0.0001456703 -1.1748543 1.113995 -0.06748207
##           GH          GI
## B 0.16333959 -0.01446484
## D 0.46472686  0.10452554
## G 0.07269803 -0.24078644
## 
## Std. Errors:
##   (Intercept)           AF         AZ         BN          BP          BQ
## B 0.003033419 0.0002067765 0.10250886 0.10604466 0.003127525 0.005593778
## D 0.001814102 0.0005766500 0.13908324 0.09631166 0.004649273 0.010453270
## G 0.001394301 0.0008964249 0.03594126 0.04231612 0.010433913 0.014872312
##            CB          CD           CR          CU         CW          DF
## B 0.003782845 0.012060344 0.0030747819 0.003814159 0.03276126 0.027786955
## D 0.018301064 0.009974369 0.0031536335 0.004436804 0.07690730 0.002487209
## G 0.014113224 0.029264555 0.0006849825 0.002890971 0.07360102 0.010225388
##             DH         DL         DN         DU         EB          EE
## B 0.0008462099 0.02238033 0.07771916 0.08718988 0.12030271 0.030727859
## D 0.0014691554 0.02686064 0.10602638 0.02984070 0.08588496 0.008289998
## G 0.0007064265 0.05782196 0.03370303 0.00471532 0.03048112 0.002127705
##             EG         EJB         EP           FE         FL         FR
## B 0.0008876063 0.003047294 0.02136469 4.335507e-05 0.03352788 0.07725206
## D 0.0016226893 0.003562378 0.01514142 8.332410e-05 0.05156255 0.07065163
## G 0.0020501798 0.001231634 0.03068324 1.272535e-04 0.01076221 0.01151692
##            GE         GH         GI
## B 0.010481071 0.05333599 0.01355872
## D 0.003285626 0.08323194 0.02565878
## G 0.075670336 0.09472909 0.05197198
## 
## Residual Deviance: 73.13537 
## AIC: 235.1354
pre <- predict(backward,data_test)
cfm <- table(pre,data_test_y);cfm
##    data_test_y
## pre   A   B   D   G
##   A 131   6   1   3
##   B  11  12   0   0
##   D   7   0   3   0
##   G   1   1   0   3
accuracy_backward <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
# note: as throughout this section, this ratio uses only the A/B block of the confusion matrix
cat("accuracy_backward = ",accuracy_backward)
## accuracy_backward =  0.89375
summary(forward)
## Call:
## multinom(formula = data_train$Alpha ~ DU + DA + FR + CR + DN + 
##     BQ + GL + AH + DE + GE + EP + EB + EU + BC + GH + EE, data = data_train)
## 
## Coefficients:
##   (Intercept)         DU          DA         FR         CR         DN
## B    4.682964  0.7561841  0.01018583   1.098818  -4.655528 -0.3323161
## D   -9.048361 -2.0350796 -0.04119132   5.036299  -7.356278 -0.2441338
## G   81.448324 -2.0636695 -0.97051550 -12.905063 -44.084635  0.3871043
##           BQ         GL           AH            DE           GE          EP
## B 0.01703131 -0.1500480  0.005022831  0.0009211991 -0.001627710 -0.05950253
## D 0.04950972  0.2842291 -0.003914131 -0.0209829496 -0.009861206 -0.02774722
## G 0.09970033  0.1726239  0.086399276 -0.0944835055 -0.339928054 -0.18319126
##           EB           EU          BC         GH        EE
## B 0.37996872 -0.011063662 -0.01118276 0.03694755 -0.336095
## D 0.01887102 -0.008052917  0.17803962 0.40707536 -1.480014
## G 2.53705482 -0.701742097  0.16916784 0.38050246 -6.597329
## 
## Std. Errors:
##   (Intercept)        DU         DA        FR        CR         DN          BQ
## B   2.7436374 0.1462798 0.01744689 0.5583844 2.0329496 0.08588757 0.004801088
## D   0.2976726 1.1095092 0.03234176 1.7545024 3.3205925 0.11860693 0.016497411
## G   0.1700142 1.4638127 0.37145371 0.5702865 0.1978381 0.43829797 0.028673035
##          GL          AH           DE          GE         EP        EB
## B 0.1081546 0.005929525 0.0008624898 0.005564302 0.01986938 0.1195114
## D 0.1386095 0.013878825 0.0090824413 0.007246348 0.01550964 0.1803792
## G 0.2943235 0.036490391 0.0529723405 0.240794691 0.10388635 1.0495497
##            EU         BC         GH        EE
## B 0.009294198 0.05459873 0.03775399 0.1666173
## D 0.010829531 0.11618957 0.14723340 0.6890262
## G 0.163787820 0.11468202 0.55531816 0.7197902
## 
## Residual Deviance: 111.5319 
## AIC: 213.5319
pre <- predict(forward,data_test)
cfm <- table(pre,data_test_y);cfm
##    data_test_y
## pre   A   B   D   G
##   A 139   9   1   4
##   B   5   9   0   0
##   D   6   1   3   0
##   G   0   0   0   2
accuracy_forward <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
cat("accuracy_forward = ",accuracy_forward)
## accuracy_forward =  0.9135802
summary(both)
## Call:
## multinom(formula = data_train$Alpha ~ AF + AZ + BD + BN + BQ + 
##     CC + CD + CR + DF + DL + DN + DU + EB + EE + EG + EJ + EP + 
##     FE + FL + FR + FS + GE + GH + GI + DE + AR + EL, data = data_train)
## 
## Coefficients:
##   (Intercept)           AF         AZ          BD         BN         BQ
## B   -52.19231 -0.002738430  1.0517901 0.002717409  1.7758614 0.15131510
## D    10.76356  0.002548216  0.8816125 0.004672917 -1.6521730 0.05760206
## G     4.77426  0.004736048 -1.2745266 0.002432164  0.5756337 0.06821597
##           CC        CD        CR        DF          DL         DN        DU
## B  -3.078254 0.1251337 -40.38782 -6.435579  0.15452383 -1.9819764  2.498541
## D -38.296357 0.2513142 -29.58681 -8.503047 -0.28339661 -0.9480800 -1.083401
## G  -5.415913 0.2065938 -43.27801  4.801040 -0.07398478 -0.5881673 -0.377662
##           EB        EE            EG        EJB          EP            FE
## B  2.8909416  -3.11140 -0.0097629294  33.846157 -0.46142411  0.0005398319
## D -0.4351447 -10.77915  0.0005105633 -10.113623  0.07741537 -0.0002263650
## G  1.4650507 -10.48945  0.0003718619  -3.722985 -0.16776230  0.0002328027
##           FL        FR          FS          GE        GH         GI
## B  0.4288937 0.7852166  -7.9341139 -0.08451908 0.3795574 -0.0983592
## D  0.6218193 0.8717090 -12.5026309  0.02142876 1.4122365  0.1947040
## G -1.8206628 0.4377497   0.2036347 -0.06991261 1.0138528 -0.3671233
##             DE        AR          EL
## B  0.006877204 1.2797536  0.06507861
## D -0.057649649 0.7935923  0.03693092
## G -0.057207956 0.6464357 -0.05644303
## 
## Std. Errors:
##    (Intercept)           AF         AZ           BD         BN          BQ
## B 0.0028313560 0.0003589849 0.10496231 0.0004540265 0.11159847 0.006641279
## D 0.0015841326 0.0005351028 0.05870352 0.0008043464 0.02954792 0.016378400
## G 0.0008153339 0.0007635421 0.01043945 0.0007795174 0.02067840 0.042483617
##             CC         CD          CR           DF         DL         DN
## B 0.0020146387 0.02012005 0.004342250 0.0036695285 0.02208428 0.08825811
## D 0.0014754502 0.01740144 0.002186468 0.0011251243 0.04472954 0.07551673
## G 0.0005017136 0.04093165 0.001902048 0.0007820536 0.08359221 0.03185583
##            DU          EB          EE          EG         EJB         EP
## B 0.061761670 0.095643015 0.023268981 0.001114418 0.003107023 0.02743849
## D 0.008042835 0.020444141 0.009356635 0.002394862 0.003588963 0.01761702
## G 0.010027478 0.008841092 0.003698352 0.003705152 0.001686041 0.03728074
##             FE         FL          FR           FS          GE         GH
## B 3.858159e-05 0.04879107 0.007995403 0.0027299254 0.013740710 0.05524666
## D 1.865896e-04 0.06210654 0.005115232 0.0015967098 0.004939822 0.14105402
## G 6.529826e-04 0.01673498 0.003951860 0.0007658799 0.037119399 0.05992543
##           GI          DE         AR         EL
## B 0.01889665 0.001343329 0.05037193 0.01299289
## D 0.03374566 0.006480173 0.01742237 0.05242144
## G 0.11921980 0.024160938 0.01917646 0.10692962
## 
## Residual Deviance: 43.45446 
## AIC: 211.4545
pre <- predict(both,data_test) # predict(both,data_test,"probs") would return class probabilities instead
cfm <- table(pre,data_test_y);cfm
##    data_test_y
## pre   A   B   D   G
##   A 135   7   1   2
##   B   9  12   1   0
##   D   5   0   0   1
##   G   1   0   2   3
accuracy_both <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
cat("accuracy_both = ",accuracy_both)
## accuracy_both =  0.9018405
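The three stepwise fits can also be compared directly by AIC (echoed above: backward 235.1, forward 213.5, both 211.5), confirming that the bidirectional search wins on that criterion:

# AIC works on multinom fits through their logLik method.
sapply(list(backward = backward, forward = forward, both = both), AIC)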
  • SVM : e1071 package
    • Support Vector Machine.
    • Finds the hyperplane (or set of hyperplanes) that separates the classes with the maximum margin in a high- or infinite-dimensional space; it applies whether or not the data are linearly separable. Beware, though, that overfitting is likely when the number of features far exceeds the number of samples.
    • Comparatively strong on small samples, nonlinear problems, high dimensionality, and local minima.
# Tune the hyper-parameters gamma, cost, and epsilon by grid search.
# tune.svm takes the candidate values directly; note that epsilon only affects
# regression SVMs and is kept here only to mirror the original grid.
tune <- tune.svm(Alpha ~ ., data=data_train,
                 type="C-classification", kernel="radial",
                 gamma=2^(-1:1), cost=2^c(-8,-4,-2,0),
                 epsilon=seq(0,10,0.1))
                   
svm <- svm(data_train$Alpha ~ .,data=data_train,cost = tune$best.model$cost,
           gamma = tune$best.model$gamma,epsilon = tune$best.model$epsilon,probability=TRUE)
summary(svm)
## 
## Call:
## svm(formula = data_train$Alpha ~ ., data = data_train, cost = tune$best.model$cost, 
##     gamma = tune$best.model$gamma, epsilon = tune$best.model$epsilon, 
##     probability = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  223
## 
##  ( 39 147 14 23 )
## 
## 
## Number of Classes:  4 
## 
## Levels: 
##  A B D G
pre <- predict(svm,data_test)
cfm <- table(pre,data_test_y);cfm
##    data_test_y
## pre   A   B   D   G
##   A 150  13   4   3
##   B   0   6   0   0
##   D   0   0   0   0
##   G   0   0   0   3
accuracy_svm <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
cat("accuracy_svm = ",accuracy_svm)
## accuracy_svm =  0.9230769
  • Ensemble learning model architecture
  • Ensemble learning
    Combine several diverse weak classifiers, each with accuracy P > 0.5, into one strong classifier.
  • boosting
    Every round trains on the full data set: first build a weak classifier, then upweight the samples it misclassifies, and iterate, finally combining all the classifiers.
  • bagging
    Sample with replacement to obtain several subsets, train a classifier on each, then combine the results: for regression, average the predictions; for classification, take a majority vote.
  • stacking
    Build several different classifiers first, then use their predictions to train a next-level classifier.
  • adaboost : adabag package
    • Adaptive Boosting.
    • Procedure: learn the first classifier on the full data set, then reweight the data according to its predictions, raising the weights of the misclassified samples, and repeat. After n classifiers have been built, the final AdaBoost model combines them all by weighted vote.
    • Advantages:
      • Very high accuracy (low bias).
      • Many different algorithms can serve as the weak classifier.
    • Disadvantages:
      • The number of iterations n (the number of classifiers) is hard to set: more iterations mean higher accuracy but also longer computation.
      • Sensitive to outliers; cross-validation can help find a good n. When the weak classifiers are complex, raising the iteration count no longer changes the test error, i.e. generalization stops improving and overfitting appears.
#AdaBoost algorithm with different numbers of classifiers
error <- as.numeric()
for(i in 1:100){
  adaboost_fit <- boosting.cv(Alpha ~ .,data=data_train, boos=TRUE, mfinal=i) 
  #v: number of cross-validation folds, default 10
  error[i] <- adaboost_fit$error
}
best.iter <- which(error == min(error))[1]

plot(error,type = "l")
points(best.iter,min(error),col = "pink",pch=19,cex=1.2)
text(best.iter+5,min(error)+0.001,round(min(error),6),cex=0.7)
abline(h=min(error),col="red")
text(20,min(error)+0.001,paste("best.iter =",best.iter),cex=0.7)

ada.model <- boosting(Alpha ~ .,data=data_train, boos=TRUE, mfinal=best.iter)
# boos (default TRUE): recompute the observation weights on each iteration; otherwise reuse the same weights.
# mfinal: the maximum number of iterations.

cat("best_mfinal = ",best.iter)
## best_mfinal =  51
ada.predict <- predict(ada.model,data_test)
cfm <- table(ada.predict$class,data_test_y);cfm
##    data_test_y
##       A   B   D   G
##   A 149   3   2   3
##   B   1  15   1   0
##   D   0   0   1   0
##   G   0   1   0   3
accuracy_ada <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
cat("accuracy_adaboost = ",accuracy_ada)
## accuracy_adaboost =  0.9761905
  • KNN : class package
    • Procedure: find the K nearest neighbours → vote → assign the class.
    • With continuous predictors, Euclidean distance serves as the metric. A test point's class is inferred from the most frequent class among its k nearest training samples.
    • The best K depends on the data. In general, a larger K suppresses noise but blurs the boundaries between classes, while a K that is too small is itself noise-sensitive. A rule of thumb is to keep K below the square root of the sample size; in practice, cross-validation over repeated train/test splits can also find a stable K.
data_train_knn <- data_train[,-36] # drop EJ (character)
data_test_knn <- data_test[,-36]   # drop EJ (character)

knn <- knn(data_train_knn[,-51],data_test_knn[,-51],data_train_knn[,51],k=7) # an odd k is recommended; k is usually capped at 20% of the training sample size
cfm <- table(knn,data_test_y)
accuracy_knn <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])

for (i in 1:round(0.2*nrow(data_train))) {
  knn <- knn(data_train_knn[,-51],data_test_knn[,-51],data_train_knn[,51],k=i)
  cfm <- table(knn,data_test_y)
  accuracy_knn[i] <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
}
accuracy_knn <- as.data.frame(accuracy_knn)
accuracy_knn$col <- rep("1",nrow(accuracy_knn))
accuracy_knn$col[which.max(accuracy_knn$accuracy_knn)] <- "2"
plot(1:round(0.2*nrow(data_train)),accuracy_knn$accuracy_knn,xlab="k",col=accuracy_knn$col)

n <- which.max(accuracy_knn$accuracy_knn)
cat("BEST k = ",n)
## BEST k =  10
knn <- knn(data_train_knn[,-51],data_test_knn[,-51],data_train_knn[,51],k=n)
cfm <- table(knn,data_test_y);cfm
##    data_test_y
## knn   A   B   D   G
##   A 150  15   4   5
##   B   0   3   0   1
##   D   0   0   0   0
##   G   0   1   0   0
accuracy_knn <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
cat("accuracy_knn = ",accuracy_knn)
## accuracy_knn =  0.9107143
  • Decision tree, CART : rpart & caret packages
    • Procedure: apply a binary split at every node and keep growing downward until the configured maximum depth is reached.
    • Advantages:
      • Simple and highly interpretable.
      • Fast to compute.
    • Disadvantages:
      • Prone to overfitting: high variance, low bias.
      • With many classes, the tree easily becomes overly complex.
cart.model <- rpart(Alpha ~ ., data = data_train, method = "class")

prp(cart.model,           # the fitted model
    fallen.leaves = TRUE, # draw the leaves along the bottom of the tree
    extra = 2)            # number of correct classifications / observations in that node

cart.predict <- predict(cart.model,data_test,type="class")
cfm <- table(cart.predict,data_test_y);cfm
##             data_test_y
## cart.predict   A   B   D   G
##            A 142   5   2   5
##            B   6  12   0   0
##            D   0   1   1   0
##            G   2   1   1   1
accuracy_cart <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1]);accuracy_cart
## [1] 0.9333333
#k-fold cross-validation: guard against overfitting.
#library(caret)
train_control <- trainControl(method="cv", number=10)
train_control.model <- train(Alpha ~., data=data_train, method="rpart", trControl=train_control);train_control.model
## CART 
## 
## 417 samples
##  51 predictor
##   4 classes: 'A', 'B', 'D', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 375, 375, 376, 377, 375, 375, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05263158  0.8584393  0.4879569
##   0.06578947  0.8488574  0.4438787
##   0.25000000  0.8178996  0.1237989
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05263158.
  • GBDT : gbm package
    • Gradient boosting. GBDT is an iterative decision-tree algorithm whose prediction is the accumulated sum of many sub-trees.
    • Procedure: start from a constant prediction and add a new learner at every step. Residuals are computed in function space; while residuals remain, keep iterating so the loss function shrinks. Each round fits the previous round's residuals, reducing the model's bias and raising accuracy.
    • Advantages:
      • Handles both discrete and continuous data, for regression as well as classification.
      • Properly weights every weak classifier.
      • Built on decision trees, so it stays fairly interpretable.
    • Disadvantages:
      • The weak classifiers depend on each other sequentially, so training is hard to parallelize.
    • Parameters:
      1. distribution : the loss function, e.g. "gaussian" (squared error), "laplace" (absolute loss), "bernoulli" (logistic regression for 0-1 outcomes), "huberized" (huberized hinge loss for 0-1 outcomes), etc.
      2. n.trees : the number of iterations.
      3. shrinkage : the learning rate. Steps that are too large keep oscillating around the minimum, so smaller is better; but smaller steps require more of them, i.e. the number of training iterations must grow before the model reaches its optimum, raising time and compute accordingly. The gbm author's rule of thumb is 0.01 to 0.001.
      4. bag.fraction : the resampling rate.
gbm <- gbm(Alpha ~ ., data = data_train, distribution = "multinomial",
           n.trees = 1000, n.minobsinnode = 10, cv.folds = 5)
best.iter <- gbm.perf(gbm,method="cv")

gbm.predict <- predict(gbm,data_test)
## Using 59 trees...
head(gbm.predict)
## , , 59
## 
##               A          B         D         G
## [1,] -0.6552952  2.0506014 -2.816112 -3.320289
## [2,]  1.9154860 -2.3490930 -2.282791 -2.478012
## [3,]  0.8739882 -2.2969251 -2.768669 -3.332144
## [4,]  3.3623642 -2.5459188 -3.638649 -3.277983
## [5,]  2.4832357  0.3145237 -2.820418 -2.036101
## [6,]  2.5207155 -3.3701377 -3.148828 -3.727783
prob.gbm.predict <- as.data.frame(apply(gbm.predict, 1, which.max))
colnames(prob.gbm.predict) <- "gbm.predict"

prob.gbm.predict <- prob.gbm.predict %>%
  mutate(gbm.predict = gsub("1","A",gbm.predict)) %>%
  mutate(gbm.predict = gsub("2","B",gbm.predict)) %>%
  mutate(gbm.predict = gsub("3","D",gbm.predict)) %>%
  mutate(gbm.predict = gsub("4","G",gbm.predict))

cfm <- table(prob.gbm.predict$gbm.predict,data_test_y);cfm
##    data_test_y
##       A   B   D   G
##   A 144   4   1   4
##   B   6  13   1   0
##   D   0   1   1   0
##   G   0   1   1   2
accuracy_GB <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1])
cat("accuracy_GB = ",accuracy_GB)
## accuracy_GB =  0.9401198
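For reference, predict.gbm can return class probabilities directly, which avoids the manual decoding above; a small sketch (passing n.trees = best.iter explicitly is an assumption consistent with the "Using 59 trees" message):

# An N x 4 x 1 probability array; pick each row's most probable class by name.
prob <- predict(gbm, data_test, n.trees = best.iter, type = "response")
gbm_class <- colnames(prob)[apply(prob[, , 1], 1, which.max)]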
  • XGboost : xgboost package
    • eXtreme Gradient Boosting.
    • Similar to GBDT, but adds L1 (Lasso) and L2 (Ridge) penalty terms on the tree structure as regularization, keeping the model from growing overly complex and lowering the chance of overfitting.
    • Parameters:
      1. data must be a matrix; xgb.DMatrix is recommended.
      2. objective [default reg:linear] : the loss function to be minimized.
        Common values: reg:linear : linear regression
        reg:logistic : logistic regression
        binary:logistic : logistic regression for binary classification, returning predicted probabilities
        binary:logitraw : logistic regression for binary classification, returning the score before the logit transformation
        multi:softmax : multiclass classification; num_class (the number of classes) must be set
        multi:softprob : like softmax, but returns each observation's per-class probabilities.
      3. eval_metric [default depends on the objective] : the evaluation metric for validation data; rmse for regression, error for classification.
        Typical values:
        rmse : root mean squared error
        mae : mean absolute error
        logloss : negative log-likelihood
        error : binary classification error rate (0.5 threshold)
        merror : multiclass error rate
        mlogloss : multiclass log-loss
        auc : area under the curve
data_train_Matrix <- data_train
data_train_Matrix[,52] <- as.numeric(factor(data_train_Matrix[,52]))-1  # label encoding: A:0, B:1, D:2, G:3
data_train_Matrix <- xgb.DMatrix(data = as.matrix(data_train_Matrix[,-c(36,52)]), label = data_train_Matrix[,52])
data_test_Matrix <- data_test
data_test_Matrix[,52] <- data_test_y  # label encoding: A:0, B:1, D:2, G:3
data_test_Matrix <- data_test_Matrix %>%
  mutate(V52 = gsub("A",0,V52)) %>%
  mutate(V52 = gsub("B",1,V52)) %>%
  mutate(V52 = gsub("D",2,V52)) %>%
  mutate(V52 = gsub("G",3,V52))
data_test_y_Label <- data_test_Matrix[,52]
data_test_Matrix <- xgb.DMatrix(data = as.matrix(data_test_Matrix[,-c(36,52)]), label = data_test_Matrix[,52])
xgb.params = list(colsample_bytree = 0.5, # fraction of columns sampled per tree; higher values use more columns and grow more complex trees
  subsample = 0.5,  # fraction of rows sampled per tree; higher values use more rows and grow more complex trees
  booster = "gbtree",
  max_depth = 2,    # maximum tree depth; deeper trees raise model complexity
  eta = 0.03,       # learning rate: shrinks each boosting step, so smaller values make the model more conservative
  eval_metric = "merror",
  objective = "multi:softmax",
  num_class = 4,
  gamma = 0         # minimum loss reduction required to split; larger values make the model more conservative and simpler
  )
cv.model <- xgb.cv(params=xgb.params,data=data_train_Matrix,
                  nfold = 5,      # 5-fold CV
                  nrounds = 200,  # evaluate models with up to 200 boosting rounds
                  # if overfitting already appears within 30 rounds, there is no point tuning further; stop early
                  early_stopping_rounds = 30, 
                  print_every_n = 20)  # print progress every 20 rounds
## [1]  train-merror:0.122910+0.012522  test-merror:0.179873+0.007728 
## Multiple eval metrics are present. Will use test_merror for early stopping.
## Will train until test_merror hasn't improved in 30 rounds.
## 
## [21] train-merror:0.090536+0.010541  test-merror:0.155977+0.014199 
## Stopping. Best iteration:
## [7]  train-merror:0.102529+0.009093  test-merror:0.148746+0.022575
xgb <- xgb.train(params=xgb.params,
               data_train_Matrix,
               nrounds = cv.model$best_iteration)
xgb.predict <- c("A","B","D","G")[predict(xgb,data_test_Matrix)+1]  # decode the numeric labels back to A/B/D/G
cfm <- table(xgb.predict,data_test_y);cfm
##            data_test_y
## xgb.predict   A   B   D   G
##           A 150  12   3   6
##           B   0   7   1   0
accuracy_XGB <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1]);accuracy_XGB
## [1] 0.9289941
cat("accuracy_XGB = ",accuracy_XGB)
## accuracy_XGB =  0.9289941
  • bagging : ipred package
    • Procedure: randomly draw a sample from the training data, train a classifier, and return the sample (n < N). Repeat many times, drawing the same n each time, to produce several classifiers. Every classifier gets equal weight, and their combination gives the final result.
    • Aggregates several high-variance or high-bias weak classifiers into one strong classifier, thereby reducing variance or bias.
    • If the training data contain noise, bagging lowers the chance that noisy records are drawn, making the model more stable.
bag.model <- bagging(Alpha ~. ,data=data_train ,
                     coob = TRUE) # Use the OOB sample to estimate the test error
bag.predict <- predict(bag.model,data_test)
cfm <- table(bag.predict,data_test_y);cfm
##            data_test_y
## bag.predict   A   B   D   G
##           A 140   3   2   5
##           B  10  15   1   0
##           D   0   0   0   0
##           G   0   1   1   1
accuracy_bag <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1]);accuracy_bag
## [1] 0.922619
cat("accuracy_bagging = ",accuracy_bag)
## accuracy_bagging =  0.922619
  • random forest : randomForest package
    • Procedure: build one tree from a random subset of the samples and of the features; repeat to obtain a many-tree classifier, then combine the trees for the final result.
    • An improvement on the decision tree: it raises accuracy while also addressing the single tree's tendency to overfit.
randomForest.model <- randomForest(Alpha ~ .,data=data_train)
randomForest.predict <- predict(randomForest.model,data_test)
cfm <- table(randomForest.predict,data_test_y);cfm
##                     data_test_y
## randomForest.predict   A   B   D   G
##                    A 150   8   2   4
##                    B   0  11   1   0
##                    D   0   0   0   0
##                    G   0   0   1   2
accuracy_randomForest <- (cfm[1,1]+cfm[2,2])/(cfm[1,1]+cfm[2,2]+cfm[1,2]+cfm[2,1]);accuracy_randomForest
## [1] 0.9526627
cat("accuracy_randomForest = ",accuracy_randomForest)
## accuracy_randomForest =  0.9526627